set.seed(2021)
library(MouseGastrulationData)
library(ggplot2)
library(reshape)
library(org.Mm.eg.db)
library(scater)
library(patchwork)
library(gtools)
Load functions
source("../scripts/StabMap_functions.R")
mt = MouseGastrulationData::AtlasSampleMetadata
samples = mt$sample[mt$stage %in% c("E8.5")]
SCE <- EmbryoAtlasData(samples = samples)
SCE
## class: SingleCellExperiment
## dim: 29452 20978
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(20978): cell_36865 cell_36866 ... cell_139330 cell_139331
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
SCE <- logNormCounts(SCE)
celltype_colours = MouseGastrulationData::EmbryoCelltypeColours
celltype_colours
## Epiblast Primitive Streak
## "#635547" "#DABE99"
## Caudal epiblast PGC
## "#9E6762" "#FACB12"
## Anterior Primitive Streak Notochord
## "#C19F70" "#0F4A9C"
## Def. endoderm Gut
## "#F397C0" "#EF5A9D"
## Nascent mesoderm Mixed mesoderm
## "#C594BF" "#DFCDE4"
## Intermediate mesoderm Caudal Mesoderm
## "#139992" "#3F84AA"
## Paraxial mesoderm Somitic mesoderm
## "#8DB5CE" "#005579"
## Pharyngeal mesoderm Cardiomyocytes
## "#C9EBFB" "#B51D8D"
## Allantois ExE mesoderm
## "#532C8A" "#8870AD"
## Mesenchyme Haematoendothelial progenitors
## "#CC7818" "#FBBE92"
## Endothelium Blood progenitors 1
## "#FF891C" "#F9DECF"
## Blood progenitors 2 Erythroid1
## "#C9A997" "#C72228"
## Erythroid2 Erythroid3
## "#F79083" "#EF4E22"
## NMP Rostral neurectoderm
## "#8EC792" "#65A83E"
## Caudal neurectoderm Neural crest
## "#354E23" "#C3C388"
## Forebrain/Midbrain/Hindbrain Spinal cord
## "#647A4F" "#CDE088"
## Surface ectoderm Visceral endoderm
## "#F7F79E" "#F6BFCB"
## ExE endoderm ExE ectoderm
## "#7F6874" "#989898"
## Parietal endoderm
## "#1A1A1A"
Randomly subset 5000 cells from this data, for speed reasons
cells_thin = sample(colnames(SCE), 5000)
SCE_thin = SCE[,cells_thin]
SCE_thin
## class: SingleCellExperiment
## dim: 29452 5000
## metadata(0):
## assays(2): counts logcounts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(5000): cell_136951 cell_132095 ... cell_39971 cell_96348
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
geneList_SCMER = sapply(mixedsort(list.files("../data/SCMER/", pattern = "nGenes", full.names = TRUE)),
scan, what = "character", simplify = FALSE)
lapply(geneList_SCMER, function(x) all(x %in% rownames(SCE)))
## $`../data/SCMER//nGenes_18.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_32.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_33.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_50.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_53.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_75.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_117.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_137.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_153.txt`
## [1] TRUE
##
## $`../data/SCMER//nGenes_257.txt`
## [1] TRUE
geneList_PC_ranked_symbols = read.csv("../data/SCMER/selection_spearman.csv", header = TRUE, row.names = 1)[,3]
geneList_PC_ranked = rowData(SCE)$ENSEMBL[match(geneList_PC_ranked_symbols, rowData(SCE)$SYMBOL)]
geneList_PC = lapply(geneList_SCMER, function(x) geneList_PC_ranked[1:length(x)])
This will be re-used multiple times. Note this is a dense matrix, as output from igraph::similarity.
full_sim = generateSimilarity(SCE_thin, batchFactor = SCE_thin$sample)
dim(full_sim)
## [1] 5000 5000
full_sim[1:5,1:5]
## cell_136951 cell_132095 cell_100178 cell_131549 cell_100447
## cell_136951 1.00000000 0 0.000000000 0.05479452 0.000000000
## cell_132095 0.00000000 1 0.000000000 0.00000000 0.000000000
## cell_100178 0.00000000 0 1.000000000 0.00000000 0.004761905
## cell_131549 0.05479452 0 0.000000000 1.00000000 0.000000000
## cell_100447 0.00000000 0 0.004761905 0.00000000 1.000000000
plotAdditional = list("celltype",
list(scale_colour_manual(values = celltype_colours),
theme(legend.position = "bottom")))
scores_SCMER = lapply(geneList_SCMER, function(geneList) {
getSubsetUncertainty(SCE_thin,
subsetGenes = geneList,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
})
scores_PC = lapply(geneList_PC, function(geneList) {
getSubsetUncertainty(SCE_thin,
subsetGenes = geneList,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
})
scores_df = data.frame(cell = colnames(SCE_thin),
score = scores_SCMER[[10]],
celltype = SCE_thin$celltype)
ggplot(scores_df, aes(x = celltype, y = score)) +
geom_boxplot(aes(fill = celltype)) +
theme_classic() +
scale_fill_manual(values = celltype_colours) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("") +
theme(legend.position = "bottom") +
NULL
I can also estimate the uncertainty for the full data, given the subset. This means that I can estimate the uncertainty for all cells, but only pull from the 5000x5000 similarity distribution from the random subset of cells. This is of course an estimate, but it’s worthwhile knowing one can circumvent needing to calculate the similarity across all cells.
joint_sample = SCE$sample
names(joint_sample) <- colnames(SCE)
scores_all_SCMER = lapply(geneList_SCMER, function(geneList) {
getSubsetUncertainty(SCE_thin,
querySCE = SCE[, setdiff(colnames(SCE), colnames(SCE_thin))],
subsetGenes = geneList,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional,
jointBatchFactor = joint_sample)
})
scores_all_PC = lapply(geneList_PC, function(geneList) {
print(length(geneList))
getSubsetUncertainty(SCE_thin,
querySCE = SCE[, setdiff(colnames(SCE), colnames(SCE_thin))],
subsetGenes = geneList,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional,
jointBatchFactor = joint_sample)
})
## [1] 18
## [1] 32
## [1] 33
## [1] 50
## [1] 53
## [1] 75
## [1] 117
## [1] 137
## [1] 153
## [1] 257
Examine these, first by combining into a single data frame
scores_SCMER_df = do.call(rbind, sapply(names(scores_SCMER), function(score_name) {
scores = scores_all_SCMER[[score_name]]
data.frame(cell = names(scores),
score = scores,
celltype = SCE[,names(scores)]$celltype,
sample = SCE[,names(scores)]$sample,
score_n = score_name,
type = "SCMER")
}, simplify = FALSE))
scores_PC_df = do.call(rbind, sapply(names(scores_PC), function(score_name) {
scores = scores_all_PC[[score_name]]
data.frame(cell = names(scores),
score = scores,
celltype = SCE[,names(scores)]$celltype,
sample = SCE[,names(scores)]$sample,
score_n = score_name,
type = "PC")
}, simplify = FALSE))
scores_all_df = rbind(scores_SCMER_df, scores_PC_df)
scores_all_df$score_n <- factor(scores_all_df$score_n, levels = mixedsort(unique(scores_all_df$score_n)))
dim(scores_all_df)
## [1] 419560 6
head(scores_all_df)
## cell score
## ../data/SCMER//nGenes_18.txt.cell_136951 cell_136951 0.09469388
## ../data/SCMER//nGenes_18.txt.cell_132095 cell_132095 0.31591837
## ../data/SCMER//nGenes_18.txt.cell_100178 cell_100178 0.38285714
## ../data/SCMER//nGenes_18.txt.cell_131549 cell_131549 0.61551020
## ../data/SCMER//nGenes_18.txt.cell_100447 cell_100447 0.40897959
## ../data/SCMER//nGenes_18.txt.cell_135831 cell_135831 0.20653061
## celltype sample
## ../data/SCMER//nGenes_18.txt.cell_136951 Erythroid2 37
## ../data/SCMER//nGenes_18.txt.cell_132095 Erythroid3 36
## ../data/SCMER//nGenes_18.txt.cell_100178 Paraxial mesoderm 29
## ../data/SCMER//nGenes_18.txt.cell_131549 Erythroid3 36
## ../data/SCMER//nGenes_18.txt.cell_100447 Pharyngeal mesoderm 29
## ../data/SCMER//nGenes_18.txt.cell_135831 Erythroid3 37
## score_n type
## ../data/SCMER//nGenes_18.txt.cell_136951 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_132095 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_100178 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_131549 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_100447 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_135831 ../data/SCMER//nGenes_18.txt SCMER
ggplot(scores_all_df, aes(x = interaction(type, score_n), y = score)) +
geom_boxplot(aes(fill = type)) +
theme_classic() +
# scale_fill_manual(values = celltype_colours) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("") +
theme(legend.position = "bottom") +
NULL
ggplot(subset(scores_all_df, score_n == "../data/SCMER//nGenes_257.txt"),
aes(x = interaction(type, celltype), y = score)) +
geom_boxplot(aes(fill = celltype)) +
theme_classic() +
scale_fill_manual(values = celltype_colours) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("") +
theme(legend.position = "bottom") +
NULL
Here I perform a clustering of the data using the best SCMER genes, extract the clusters with the highest uncertainty values, then re-cluster these cells using the entire transcriptome. Presumably, the markers for these re-clusters represent the gene expression differences that are as yet unaccounted for within the SCMER gene set.
SCE_sub <- logNormCounts(SCE[geneList_SCMER[[length(geneList_SCMER)]],])
fit = modelGeneVar(logcounts(SCE_sub))
HVGs = getTopHVGs(fit)
length(HVGs)
## [1] 120
SCE_sub <- runPCA(SCE_sub, subset_row = rownames(SCE_sub))
SCE_sub_corrected <- fastMNN(SCE_sub, batch = SCE_sub$sample)
PCs_sub = reducedDim(SCE_sub_corrected, "corrected")
reducedDim(SCE_sub, "UMAP") <- calculateUMAP(t(PCs_sub))
cl = clusterSNNGraph(SCE_sub_corrected, use.dimred = "corrected")
table(cl)
## cl
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1290 265 403 1375 807 3095 981 2738 563 796 734 471 200 123 111 657
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 163 548 67 141 207 24 863 806 40 132 843 605 118 845 288 97
## 33 34 35 36 37
## 27 45 245 249 16
SCE_sub$cluster_SCMER = paste0("SCMER_cluster_", cl)
SCE_sub$score = scores_all_SCMER[[length(scores_all_SCMER)]][colnames(SCE_sub)]
plotUMAP(SCE_sub, colour_by = "cluster_SCMER") + scale_colour_discrete() + ggtitle("Clusters using SCMER genes")
plotUMAP(SCE_sub, colour_by = "score")
plotUMAP(SCE_sub, colour_by = "celltype") + scale_colour_manual(values = celltype_colours)
ggplot(as.data.frame(colData(SCE_sub)), aes(x = cluster_SCMER, y = score)) +
geom_boxplot(aes(fill = cluster_SCMER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
NULL
There are clusters that have a high uncertainty value, while others remain fairly low. This means that for these clusters, there may be additional genes that are needed to better estimate the cell-cell neighbourhoods.
Re-cluster the cells within these high uncertainty clusters using all genes, and extract their marker genes. Visualise these on the full UMAP and examine what genes they are.
clusters_ordered_uncertainty = names(sort(tapply(SCE_sub$score, SCE_sub$cluster_SCMER, median), decreasing = TRUE))
clusters_ordered_uncertainty
## [1] "SCMER_cluster_2" "SCMER_cluster_13" "SCMER_cluster_35" "SCMER_cluster_7"
## [5] "SCMER_cluster_17" "SCMER_cluster_24" "SCMER_cluster_6" "SCMER_cluster_3"
## [9] "SCMER_cluster_22" "SCMER_cluster_33" "SCMER_cluster_28" "SCMER_cluster_27"
## [13] "SCMER_cluster_21" "SCMER_cluster_14" "SCMER_cluster_26" "SCMER_cluster_4"
## [17] "SCMER_cluster_11" "SCMER_cluster_23" "SCMER_cluster_31" "SCMER_cluster_29"
## [21] "SCMER_cluster_20" "SCMER_cluster_34" "SCMER_cluster_8" "SCMER_cluster_1"
## [25] "SCMER_cluster_10" "SCMER_cluster_16" "SCMER_cluster_30" "SCMER_cluster_5"
## [29] "SCMER_cluster_36" "SCMER_cluster_19" "SCMER_cluster_25" "SCMER_cluster_9"
## [33] "SCMER_cluster_18" "SCMER_cluster_12" "SCMER_cluster_32" "SCMER_cluster_37"
## [37] "SCMER_cluster_15"
minFDRs_list = list()
for (i in 1:5) {
print(i)
SCE_sub$cluster_SCMER_tmp = SCE_sub$cluster_SCMER == clusters_ordered_uncertainty[i]
g = plotUMAP(SCE_sub, colour_by = "cluster_SCMER_tmp") +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
ggtitle(paste("Clusters using SCMER genes on SCMER genes UMAP, ",
clusters_ordered_uncertainty[i]))
print(g)
g = plotReducedDim(SCE_sub, dimred = "umap", colour_by = "cluster_SCMER_tmp") +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
ggtitle(paste("Clusters using SCMER genes on full data UMAP, ",
clusters_ordered_uncertainty[i])) +
coord_fixed()
print(g)
table(SCE_sub$celltype, SCE_sub$cluster_SCMER_tmp)
# cluster within these cells
SCE_full = SCE[,colnames(SCE_sub)[SCE_sub$cluster_SCMER_tmp]]
SCE_full <- logNormCounts(SCE_full)
fit = modelGeneVar(logcounts(SCE_full), block = SCE_full$sample)
HVGs = getTopHVGs(fit)
length(HVGs)
SCE_full <- runPCA(SCE_full, subset_row = HVGs)
SCE_full_corrected <- fastMNN(SCE_full, batch = SCE_full$sample)
SCE_full_clusters = clusterSNNGraph(SCE_full_corrected, use.dimred = "corrected")
table(SCE_full_clusters)
SCE_full$full_cluster = SCE_full_clusters
full_cluster_markers_array = getMarkerArray(SCE_full, "full_cluster")
minFDRs = rowMins(full_cluster_markers_array[,,"FDR"], na.rm = TRUE)
names(minFDRs) <- dimnames(full_cluster_markers_array)[[1]]
minFDRs_list[[i]] <- minFDRs
sort(minFDRs)[1:5]
table(minFDRs < 0.01)
head(minFDRs[minFDRs < 0.01])
rowData(SCE_full)$SYMBOL[order(minFDRs)[1:5]]
gList = sapply(rowData(SCE_full)$SYMBOL[order(minFDRs)[1:9]], function(gene) {
plotReducedDim(SCE, dimred = "umap",
colour_by = gene,
swap_rownames = "SYMBOL",
by_exprs_values = "logcounts") +
coord_fixed()
}, simplify = FALSE)
g = wrap_plots(gList, nrow = 3, ncol = 3)
print(g)
}
## [1] 1
## [1] "1"
## [1] "2"
## [1] "3"
## [1] 2
## [1] "1"
## [1] "2"
## [1] "3"
## [1] 3
## [1] "1"
## [1] "2"
## [1] "3"
## [1] 4
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] 5
## [1] "1"
## [1] "2"
## [1] "3"
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /usr/local/Cellar/openblas/0.3.13/lib/libopenblasp-r0.3.13.dylib
## LAPACK: /usr/local/Cellar/r/4.0.4/lib/R/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] BiocNeighbors_1.8.2 batchelor_1.6.2
## [3] igraph_1.2.6 bluster_1.0.0
## [5] scran_1.18.3 gtools_3.8.2
## [7] patchwork_1.1.1 scater_1.18.3
## [9] org.Mm.eg.db_3.12.0 AnnotationDbi_1.52.0
## [11] reshape_0.8.8 ggplot2_3.3.3
## [13] MouseGastrulationData_1.4.0 SingleCellExperiment_1.12.0
## [15] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [17] GenomicRanges_1.42.0 GenomeInfoDb_1.26.2
## [19] IRanges_2.24.1 S4Vectors_0.28.1
## [21] BiocGenerics_0.36.0 MatrixGenerics_1.2.0
## [23] matrixStats_0.57.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_4.0.5
## [3] RcppAnnoy_0.0.18 httr_1.4.2
## [5] tools_4.0.4 ResidualMatrix_1.0.0
## [7] R6_2.5.0 irlba_2.3.3
## [9] vipor_0.4.5 uwot_0.1.10
## [11] DBI_1.1.1 colorspace_2.0-0
## [13] withr_2.4.1 tidyselect_1.1.0
## [15] gridExtra_2.3 bit_4.0.4
## [17] curl_4.3 compiler_4.0.4
## [19] DelayedArray_0.16.1 labeling_0.4.2
## [21] scales_1.1.1 rappdirs_0.3.3
## [23] stringr_1.4.0 digest_0.6.27
## [25] rmarkdown_2.6 XVector_0.30.0
## [27] pkgconfig_2.0.3 htmltools_0.5.1.1
## [29] sparseMatrixStats_1.2.0 limma_3.46.0
## [31] dbplyr_2.1.0 fastmap_1.1.0
## [33] rlang_0.4.10 RSQLite_2.2.3
## [35] shiny_1.6.0 DelayedMatrixStats_1.12.2
## [37] farver_2.0.3 generics_0.1.0
## [39] BiocParallel_1.24.1 dplyr_1.0.3
## [41] RCurl_1.98-1.2 magrittr_2.0.1
## [43] BiocSingular_1.6.0 GenomeInfoDbData_1.2.4
## [45] scuttle_1.0.4 Matrix_1.3-2
## [47] Rcpp_1.0.6 ggbeeswarm_0.6.0
## [49] munsell_0.5.0 viridis_0.5.1
## [51] lifecycle_0.2.0 edgeR_3.32.1
## [53] stringi_1.5.3 yaml_2.2.1
## [55] zlibbioc_1.36.0 plyr_1.8.6
## [57] BiocFileCache_1.14.0 AnnotationHub_2.22.0
## [59] grid_4.0.4 blob_1.2.1
## [61] dqrng_0.2.1 promises_1.1.1
## [63] ExperimentHub_1.16.0 crayon_1.3.4
## [65] lattice_0.20-41 cowplot_1.1.1
## [67] beachmat_2.6.4 locfit_1.5-9.4
## [69] knitr_1.30 pillar_1.4.7
## [71] codetools_0.2-18 glue_1.4.2
## [73] BiocVersion_3.12.0 evaluate_0.14
## [75] BiocManager_1.30.10 vctrs_0.3.6
## [77] httpuv_1.5.5 gtable_0.3.0
## [79] purrr_0.3.4 assertthat_0.2.1
## [81] cachem_1.0.1 xfun_0.20
## [83] rsvd_1.0.3 mime_0.9
## [85] xtable_1.8-4 RSpectra_0.16-0
## [87] later_1.1.0.1 viridisLite_0.3.0
## [89] tibble_3.0.5 beeswarm_0.2.3
## [91] memoise_2.0.0 statmod_1.4.35
## [93] ellipsis_0.3.1 interactiveDisplayBase_1.28.0